# encoding=utf-8

from bs4 import BeautifulSoup
import re
import requests
import time
import csv
import codecs
import urllib
# ---- Python 2 default-encoding workaround ----
# reload() restores sys.setdefaultencoding, which the site module deletes
# at interpreter startup; makes implicit str<->unicode conversions use
# UTF-8 instead of ASCII. Python 2 only.
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# ----------------------------------------------

# Output CSV in binary append mode: rows from repeated runs accumulate
# and the BOM bytes below are written verbatim.
csvFile = open("./star-comment.csv", "ab+")
# Bug fix: write the UTF-8 BOM only when the file is empty. The old code
# appended a BOM on every run, scattering BOM bytes into the data.
csvFile.seek(0, 2)
if csvFile.tell() == 0:
    csvFile.write(codecs.BOM_UTF8)

class Comitem():
    """Container for one scraped comment: a star rating and its text."""

    def __init__(self):
        # Rating digit extracted from the CSS class name, e.g. "4".
        self.star = ""
        # Consistency fix: the comment is only ever assigned a string
        # (see getComment), so initialise it as a string, not a list.
        self.comment = ""

url_lib = []  # pool of comment-page URLs still waiting to be scraped
# Browser-impersonating headers sent with every Douban request.
# SECURITY NOTE(review): the Cookie value below is a captured live session
# (it even embeds the account e-mail) -- it will expire, and credentials
# like this should never be committed to source control.
headers = { 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding':'gzip, deflate, sdch, br',
            'Accept-Language':'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
            'Cache-Control':'max-age=0',
            'Keep-Alive':'timeout=30',
            'Cookie':'''bid=u1cPPsPJ94Q; ll="108288"; viewed="26616244"; gr_user_id=6b4b875c-c8d0-44c8-9eea-307ab11ef5eb; ps=y; ue="huangkenc@126.com"; dbcl2="160222039:LToHUOhNf1g"; ck=bX43; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1492002980%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D3gViGYLduCRJ7TOnXzMUs7M67wYlwj_8YTjC7CLyYxGCyWry7XndlStL7zamcXMP%26wd%3D%26eqid%3Debfab95a00034f3a0000000358ee0bb7%22%5D; __utmt_douban=1; ap=1; ct=y; _vwo_uuid_v2=F38BE1D398836EA7CF7EB0B6678BD51D|be9ece6ba5e729801f71f8e1c0003d96; _pk_id.100001.4cf6=8a32b65e5fb5ef27.1488888910.21.1492004498.1491997361.; _pk_ses.100001.4cf6=*; __utma=30149280.644910076.1476457902.1491995584.1492002980.39; __utmb=30149280.10.10.1492002980; __utmc=30149280; __utmz=30149280.1491995584.38.33.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.16022; __utma=223695111.650469051.1488888910.1491995584.1492002980.23; __utmb=223695111.0.10.1492002980; __utmc=223695111; __utmz=223695111.1491995584.22.17.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; push_noty_num=0; push_doumail_num=0''',
            'Host':'movie.douban.com',
            'Referer':'https://movie.douban.com/subject/26862259/',
            'Upgrade-Insecure-Requests':'1',
            'User-Agent':'''Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'''
            }

def login(url):
    """Log in to Douban, solving the captcha interactively, then resume
    scraping `url` via getComment().

    SECURITY NOTE(review): the account credentials are hard-coded in
    plain text below -- move them to a config file or environment vars.
    """
    global headers
    loginUrl = 'https://www.douban.com/accounts/login?https://movie.douban.com/subject/26862259/comments'
    headers['Referer'] = url
    headers['Connection'] = 'keep-alive'
    formData = {
        "source": 'None',
        "redir": '''https://movie.douban.com/subject/26862259/comments''',
        "form_email": 'huangkenc@126.com',
        "form_password": 'a123456789',
        "login": u'登录'
    }
    r = requests.post(loginUrl, data=formData, headers=headers)
    page = r.text
    # ---- fetch the captcha image ----
    soup = BeautifulSoup(page, "html.parser")
    captchaImg = soup.find('img', id='captcha_image')
    # Bug fix: find() returns None when the page shows no captcha; the old
    # code crashed on ['src'] before its emptiness check could ever run.
    if captchaImg is None:
        return
    captchaAddr = captchaImg['src']
    if captchaAddr != "":
        # Pull the hidden captcha-id field out of the login page markup.
        reCaptchaID = r'<input type="hidden" name="captcha-id" value="(.*?)"/'
        captchaID = re.findall(reCaptchaID, page)
        # Save the captcha image locally so the user can read it.
        urllib.urlretrieve(captchaAddr, "verify.jpg")
        captcha = raw_input('please input the captcha:')
        formData['captcha-solution'] = captcha
        # Bug fix: re.findall() returns a list; the form must carry the
        # single matched ID string, not the list's repr.
        if captchaID:
            formData['captcha-id'] = captchaID[0]
        requests.post(loginUrl, data=formData, headers=headers)
        getComment(url)


def getComment(url):
    """Scrape one Douban comment page: write (star, comment) rows to the
    module-level CSV, queue the next page, then recurse over the queue.

    NOTE(review): the trailing loop recurses once per queued URL, so a
    very long comment thread can hit Python's recursion limit -- confirm
    whether an iterative worklist loop is acceptable here.
    """
    global headers
    global url_lib
    # Drop this url from the pending pool so it is not visited twice.
    if url in url_lib:
        url_lib.remove(url)
    print("已抓到" + url)
    session = requests.Session()
    # Do not keep the connection alive between requests.
    session.keep_alive = False
    # Allow a few retries on flaky connections.
    requests.DEFAULT_RETRIES = 5
    res = session.get(url, headers=headers)
    bsObj = BeautifulSoup(res.text, "html.parser")
    # Hoisted out of the loop: one csv.writer serves every row on the page.
    writer = csv.writer(csvFile)
    for item in bsObj.findAll(class_="comment"):
        temp = Comitem()
        try:
            star_class = item.find(class_="rating").attrs['class'][0]
            # The rating CSS class embeds the star count as a digit.
            pattern = re.compile(r'[0-9]')
            temp.star = pattern.search(star_class).group()
            temp.comment = item.p.get_text().replace("\n", "").replace("\"", "")
            writer.writerow((temp.star, temp.comment))
        except (AttributeError, KeyError, IndexError):
            # Narrowed from a bare except: only the lookup failures that a
            # comment without a rating actually produces are swallowed.
            print("此页出错")
    # ---- queue the next page into the link pool ----
    try:
        link = "https://movie.douban.com/subject/26862259/comments" + bsObj.find(class_="next").get('href')
        url_lib.append(link)
    except AttributeError:
        # No "next" link -- assume we were bounced to the login page.
        login(url)
    # Iterate a snapshot: the recursive calls mutate url_lib while we loop.
    for i in list(url_lib):
        time.sleep(3)
        getComment(i)
# Entry point: seed the crawl with the first comments page. The finally
# clause guarantees the CSV handle is flushed and closed even when the
# scrape aborts part-way through (the old code leaked it on any error).
url = "https://movie.douban.com/subject/26862259/comments"
try:
    getComment(url)
finally:
    csvFile.close()

# ---------

